#Importe de librerías
#Manejo de datos
import pandas as pd
import seaborn as sns
import numpy as np
import collections, os
from collections import Counter
from pandas_profiling import ProfileReport
from functools import reduce
#Progress bar
from tqdm import tqdm
import re, unicodedata, spacy
#, es_core_news_sm
#Directory listing
from os import listdir
from os.path import isfile, join
#Texto
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
from sklearn.preprocessing import MinMaxScaler
from difflib import SequenceMatcher
#Fecha
import dateparser
from datetime import datetime
sns.set()
date_fmt = '%b %Y'
#Graficación
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import matplotlib.pyplot as plt
def txt_preproc(input_str):
'''
Función para preprocesar cadenas de texto
Tomado/basado en:
https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-normalize-in-a-python-unicode-string
https://stackoverflow.com/questions/5843518/remove-all-special-characters-punctuation-and-spaces-from-string
Recibe una cadena de texto en formato string llamado ``input_str``
Primero remueve caracteres especiales
Remueve acentos y caracteres unicode
Finalmente con el método .to_lower() convierte los caracteres a minúscula
Args:
input_str (string): Cadena de texto a preprocesar
Returns:
nfkd_form (string): Cadena de texto preprocesada.
'''
nfkd_form = re.sub(r'[?|$|.|!]',r'',input_str) #Remove special chars
nfkd_form = unicodedata.normalize('NFKD', nfkd_form) #Remove accents
nfkd_form = u"".join([c for c in nfkd_form if not unicodedata.combining(c)]) #Remove unicode chars
nfkd_form = (re.sub(r'[^a-zA-Z0-9 ]',r'',nfkd_form)).lower() #Remove missing special chars and convert to lowercase
return nfkd_form
mypath = './dataset/soa_sports_data.csv'
soa_df = pd.read_csv(mypath, encoding='latin1', error_bad_lines=False, low_memory=False)
soa_df.columns = [x.lower() for x in soa_df.columns]
soa_df.head()
| query | año | citas | type | titulo | abstract | revista_or_evento | a_1 | a_2 | a_3 | ... | a_5 | a_6 | a_7 | a_8 | a_9 | a_10 | a_11 | a_12 | a_13 | a_14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | "soccer" and "analytics" and "world" | 2014 | 89 | Article | Feature-driven visual analytics of soccer data | Soccer is one the most popular sports today an... | 2014 IEEE conference on visual analytics scien... | Sacha, D | Stein, M. | Schreck, T | ... | Deussen, O. | - | - | - | - | - | - | - | - | - |
| 1 | "soccer" and "analytics" and "world" | 2015 | 35 | Article | Visual Soccer Analytics: Understanding the Cha... | With recent advances in sensor technologies, l... | ISPRS International Journal of Geo-Information | Stein, M. | Häußler, J. | Jäckle, D. | ... | Schreck, T. | Keim, D. | - | - | - | - | - | - | - | - |
| 2 | "soccer" and "analytics" and "world" | 2013 | 35 | Thesis | Machine Learning for Soccer Analytics | Sports analytics has been successfully applied... | University of Leuven | Kumar, G. | - | - | ... | - | - | - | - | - | - | - | - | - | - |
| 3 | "soccer" and "analytics" and "world" | 2014 | 64 | Article | Bagadus: An integrated real-time system for so... | The importance of winning has increased the ro... | ACM Transactions on Multimedia Computing, Comm... | Stensland, H. K. | Gaddam, V. R | Tennøe, M. | ... | Næss, M. | Alstad, H. K. | Mortensen, A. | Langseth, R. | Ljødal, S. | Landsverk, Ø. | Griwodz, C. | Halvorsen, P. | Stenhaug, M. | Johansen, D. |
| 4 | "soccer" and "analytics" and "world" | 2014 | 0 | Article | World Cup sets US live streaming record | The world cup has become the most streamed liv... | FGV Digital Respository | Bond, S. | - | - | ... | - | - | - | - | - | - | - | - | - | - |
5 rows × 21 columns
soa_df.insert(7, 'titulo_proc', None)
soa_df.insert(8, 'abstract_proc', None)
soa_df.insert(9, 'roe_proc', None)
soa_df.insert(10, 'len_titulo', None)
soa_df.insert(11, 'len_abstract', None)
#Preprocessing text
for row in tqdm(range(len(soa_df))):
df_title = soa_df['titulo'].iloc[row]
df_abstract = soa_df['abstract'].iloc[row]
df_roe = soa_df['revista_or_evento'].iloc[row]
len_title = len(df_title.split(" "))
len_abstract = len(df_abstract.split(" "))
df_title = txt_preproc(df_title)
df_abstract = txt_preproc(df_abstract)
df_roe = txt_preproc(df_roe)
#============================================
soa_df['titulo_proc'].iloc[row] = df_title
soa_df['abstract_proc'].iloc[row] = df_abstract
soa_df['roe_proc'].iloc[row] = df_roe
soa_df['len_titulo'].iloc[row] = len_title
soa_df['len_abstract'].iloc[row] = len_abstract
0%| | 0/102 [00:00<?, ?it/s]C:\Users\user\anaconda3\envs\py38dane\lib\site-packages\pandas\core\indexing.py:1637: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy self._setitem_single_block(indexer, value, name) 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:00<00:00, 552.74it/s]
soa_df.head()
| query | año | citas | type | titulo | abstract | revista_or_evento | titulo_proc | abstract_proc | roe_proc | ... | a_5 | a_6 | a_7 | a_8 | a_9 | a_10 | a_11 | a_12 | a_13 | a_14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | "soccer" and "analytics" and "world" | 2014 | 89 | Article | Feature-driven visual analytics of soccer data | Soccer is one the most popular sports today an... | 2014 IEEE conference on visual analytics scien... | featuredriven visual analytics of soccer data | soccer is one the most popular sports today an... | 2014 ieee conference on visual analytics scien... | ... | Deussen, O. | - | - | - | - | - | - | - | - | - |
| 1 | "soccer" and "analytics" and "world" | 2015 | 35 | Article | Visual Soccer Analytics: Understanding the Cha... | With recent advances in sensor technologies, l... | ISPRS International Journal of Geo-Information | visual soccer analytics understanding the char... | with recent advances in sensor technologies la... | isprs international journal of geoinformation | ... | Schreck, T. | Keim, D. | - | - | - | - | - | - | - | - |
| 2 | "soccer" and "analytics" and "world" | 2013 | 35 | Thesis | Machine Learning for Soccer Analytics | Sports analytics has been successfully applied... | University of Leuven | machine learning for soccer analytics | sports analytics has been successfully applied... | university of leuven | ... | - | - | - | - | - | - | - | - | - | - |
| 3 | "soccer" and "analytics" and "world" | 2014 | 64 | Article | Bagadus: An integrated real-time system for so... | The importance of winning has increased the ro... | ACM Transactions on Multimedia Computing, Comm... | bagadus an integrated realtime system for socc... | the importance of winning has increased the ro... | acm transactions on multimedia computing commu... | ... | Næss, M. | Alstad, H. K. | Mortensen, A. | Langseth, R. | Ljødal, S. | Landsverk, Ø. | Griwodz, C. | Halvorsen, P. | Stenhaug, M. | Johansen, D. |
| 4 | "soccer" and "analytics" and "world" | 2014 | 0 | Article | World Cup sets US live streaming record | The world cup has become the most streamed liv... | FGV Digital Respository | world cup sets us live streaming record | the world cup has become the most streamed liv... | fgv digital respository | ... | - | - | - | - | - | - | - | - | - | - |
5 rows × 26 columns
soa_df.to_csv('./dataset/soa_sports_data_proc.csv', index=False)
all_year_base_gb = soa_df.groupby(["año"]).count()
all_year_base_gb = all_year_base_gb.reset_index()
all_year_base_gb = all_year_base_gb[["año", "query"]]
all_year_base_gb = all_year_base_gb.rename(columns={'año': 'year', 'query': 'count'})
all_year_base_gb = all_year_base_gb.sort_values(by=['year'])
all_year_base_gb
| year | count | |
|---|---|---|
| 0 | 2013 | 9 |
| 1 | 2014 | 26 |
| 2 | 2015 | 26 |
| 3 | 2016 | 9 |
| 4 | 2017 | 11 |
| 5 | 2018 | 11 |
| 6 | 2019 | 5 |
| 7 | 2020 | 4 |
| 8 | 2021 | 1 |
year_labels = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
year_values = [int(all_year_base_gb[all_year_base_gb['year']==year_labels[0]]['count']),
int(all_year_base_gb[all_year_base_gb['year']==year_labels[1]]['count']),
int(all_year_base_gb[all_year_base_gb['year']==year_labels[2]]['count']),
int(all_year_base_gb[all_year_base_gb['year']==year_labels[3]]['count']),
int(all_year_base_gb[all_year_base_gb['year']==year_labels[4]]['count']),
int(all_year_base_gb[all_year_base_gb['year']==year_labels[5]]['count']),
int(all_year_base_gb[all_year_base_gb['year']==year_labels[6]]['count']),
int(all_year_base_gb[all_year_base_gb['year']==year_labels[7]]['count']),
int(all_year_base_gb[all_year_base_gb['year']==year_labels[8]]['count'])]
fig = go.Figure(data=[go.Pie(labels=year_labels, values=year_values, hole=.3)])
fig.update_layout(
title_text="2013 - 2021: Annual Production, Total %",
legend_title="Year",
# Add annotations in the center of the donut pies.
annotations=[dict(text='Prod %', x=0.5, y=0.5, font_size=13, showarrow=False)],
autosize=False,
width=950,
height=400,
margin=dict(l=0, r=0, t=50, b=0),
font=dict(size=12))
fig.show()
all_year_base_pt_gb = soa_df.groupby(['año', 'type']).count()
all_year_base_pt_gb = all_year_base_pt_gb.reset_index()
all_year_base_pt_gb = all_year_base_pt_gb[["año", "type", "query"]]
all_year_base_pt_gb = all_year_base_pt_gb.rename(columns={'año': 'year', 'query': 'count'})
all_year_base_pt_gb = all_year_base_pt_gb.sort_values(by=['year'])
all_year_base_pt_gb_arts = all_year_base_pt_gb[all_year_base_pt_gb['type']=='Article']
all_year_base_pt_gb_thes = all_year_base_pt_gb[all_year_base_pt_gb['type']=='Thesis']
all_year_base_pt_gb_book = all_year_base_pt_gb[all_year_base_pt_gb['type']=='Book']
all_year_base_pt_gb
| year | type | count | |
|---|---|---|---|
| 0 | 2013 | Article | 7 |
| 1 | 2013 | Book | 1 |
| 2 | 2013 | Thesis | 1 |
| 3 | 2014 | Article | 20 |
| 4 | 2014 | Book | 5 |
| 5 | 2014 | Thesis | 1 |
| 7 | 2015 | Book | 2 |
| 8 | 2015 | Thesis | 3 |
| 6 | 2015 | Article | 21 |
| 9 | 2016 | Article | 9 |
| 10 | 2017 | Article | 11 |
| 11 | 2018 | Article | 11 |
| 12 | 2019 | Article | 1 |
| 13 | 2019 | Book | 1 |
| 14 | 2019 | article | 3 |
| 15 | 2020 | Article | 4 |
| 16 | 2021 | Article | 1 |
#animals=['giraffes', 'orangutans', 'monkeys']
year_labels = ['2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']
fig = go.Figure(data=[go.Bar(name='Articles', x=year_labels, y=list(all_year_base_pt_gb_arts['count']), width=0.3),
go.Bar(name='Thesis', x=year_labels, y=list(all_year_base_pt_gb_thes['count']), width=0.3),
go.Bar(name='Books', x=year_labels, y=list(all_year_base_pt_gb_book['count']), width=0.3)
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.update_layout(
title_text="2013 - 2021: Annual Production by Type",
xaxis_title="Year",
yaxis_title="Total",
legend_title="Type",
autosize=False,
width=950,
height=400,
margin=dict(l=0, r=0, t=50, b=0),
font=dict(size=11))
fig.show()
all_year_base_cites_gb = soa_df.groupby(['año', 'citas']).count()
all_year_base_cites_gb = all_year_base_cites_gb.reset_index()
all_year_base_cites_gb = all_year_base_cites_gb[["año", "citas"]]
all_year_base_cites_gb = all_year_base_cites_gb.rename(columns={'año': 'year', 'citas': 'cites'})
all_year_base_cites_gb.head()
| year | cites | |
|---|---|---|
| 0 | 2013 | 6 |
| 1 | 2013 | 8 |
| 2 | 2013 | 18 |
| 3 | 2013 | 33 |
| 4 | 2013 | 35 |
df_cites_sum = all_year_base_cites_gb.groupby(['year']).sum().reset_index().rename(columns={'cites': 'tot'})
df_cites_mean = all_year_base_cites_gb.groupby(['year']).mean().reset_index().rename(columns={'cites': 'mean_val'})
df_cites_median = all_year_base_cites_gb.groupby(['year']).median().reset_index().rename(columns={'cites': 'median_val'})
df_cites_max = all_year_base_cites_gb.groupby(['year']).max().reset_index().rename(columns={'cites': 'max_val'})
df_cites_min = all_year_base_cites_gb.groupby(['year']).min().reset_index().rename(columns={'cites': 'min_val'})
cites_data_frames = [df_cites_sum, df_cites_mean, df_cites_median, df_cites_max, df_cites_min]
df_merged = reduce(lambda left,right: pd.merge(left,right,on=['year'],
how='outer'), cites_data_frames)
df_merged
| year | tot | mean_val | median_val | max_val | min_val | |
|---|---|---|---|---|---|---|
| 0 | 2013 | 1874 | 208.222222 | 35.0 | 1177 | 6 |
| 1 | 2014 | 750 | 41.666667 | 26.5 | 177 | 0 |
| 2 | 2015 | 532 | 26.600000 | 22.5 | 77 | 0 |
| 3 | 2016 | 746 | 93.250000 | 39.5 | 306 | 2 |
| 4 | 2017 | 156 | 19.500000 | 16.5 | 54 | 5 |
| 5 | 2018 | 202 | 22.444444 | 24.0 | 43 | 1 |
| 6 | 2019 | 109 | 21.800000 | 12.0 | 80 | 1 |
| 7 | 2020 | 22 | 7.333333 | 3.0 | 19 | 0 |
| 8 | 2021 | 1 | 1.000000 | 1.0 | 1 | 1 |
fig = go.Figure()
year_labels = ['2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']
fig.add_trace(go.Bar(x=year_labels,y=list(df_merged.tot), width=0.3, name='Total'))
fig.add_trace(go.Scatter(x=year_labels,y=list(df_merged.mean_val), name='Mean'))
fig.add_trace(go.Scatter(x=year_labels,y=list(df_merged.median_val), name='Median'))
fig.add_trace(go.Scatter(x=year_labels,y=list(df_merged.max_val), name='Max'))
fig.add_trace(go.Scatter(x=year_labels,y=list(df_merged.min_val), name='Min'))
fig.update_layout(
title_text="2013 - 2021: Citations stats",
xaxis_title="Year",
yaxis_title="Total",
legend_title="Stats",
autosize=False,
width=950,
height=400,
margin=dict(l=0, r=0, t=50, b=0),
font=dict(size=11))
fig.show()
#Año, citas, len de title, len de abstract
all_year_base_tot_gb = soa_df.groupby(['año', 'citas'], as_index=False).sum()
all_year_base_tot_gb = all_year_base_tot_gb[['año', 'citas', 'len_titulo', 'len_abstract']]
all_year_base_tot_gb = all_year_base_tot_gb.rename(columns={'año': 'year', 'citas': 'cites'})
all_year_base_tot_gb.head()
| year | cites | len_titulo | len_abstract | |
|---|---|---|---|---|
| 0 | 2013 | 6 | 8 | 1 |
| 1 | 2013 | 8 | 8 | 188 |
| 2 | 2013 | 18 | 10 | 158 |
| 3 | 2013 | 33 | 3 | 114 |
| 4 | 2013 | 35 | 5 | 406 |
fig = go.Figure(data=[go.Scatter3d(
x=all_year_base_tot_gb.len_abstract,
y=all_year_base_tot_gb.len_titulo,
z=all_year_base_tot_gb.cites,
name= 'Color scale',
mode='markers',
marker=dict(
size=8,
color=all_year_base_tot_gb.year, # set color to an array/list of desired values
#colorscale='Viridis', # choose a colorscale
opacity=0.8,
showscale=True,
colorbar = dict(len=0.8)
)
)])
# tight layout
#fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.update_layout(scene = dict(
xaxis_title='Abstract lenght',
yaxis_title='Title lenght',
zaxis_title='Cites'),
width=700,
margin=dict(r=0, b=0, l=0, t=0))
fig.update_layout(showlegend=True)
#fig.update_traces(legendgroup=all_year_base_tot_gb.year)
fig.update_traces(legendgroup=8)
fig.show()
title_words = ''
title_list_full, title_list_filtered = [], []
# iterate through the csv file
for val in soa_df.titulo_proc:
# typecaste each val to string
val = str(val)
# split the value
tokens = val.split()
for tok in tokens:
title_list_full.append(tok)
if tok not in stopwords:
title_list_filtered.append(tok)
title_words += " ".join(tokens)+" "
wordcloud = WordCloud(width = 1000, height = 400,
background_color ='white',
stopwords = stopwords,
min_font_size = 10).generate(title_words)
# plot the WordCloud image
plt.figure(figsize = (9, 9), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()
sorted_tc_dict = sorted(dict(Counter(title_list_filtered)).items(), key=lambda kv: kv[1])
title_array = np.asarray(sorted_tc_dict)
labels = np.flipud(title_array[:,0])
values = np.flipud(title_array[:,1]).astype(int)
data = [go.Bar(
x = labels[:100],
y = values[:100]
)]
fig = go.Figure(data=data)
fig.update_layout(xaxis_tickangle=-45)
fig.update_layout(
title_text='Top 100 high-frequency words for titles',
xaxis_title="Words",
yaxis_title="Frequency",
)
fig.show()
### Wordcloud review by abstract
abstract_words = ''
abstract_list_full, abstract_list_filtered = [], []
# iterate through the csv file
for val in soa_df.abstract_proc:
# typecaste each val to string
val = str(val)
# split the value
tokens = val.split()
for tok in tokens:
abstract_list_full.append(tok)
if tok not in stopwords:
abstract_list_filtered.append(tok)
abstract_words += " ".join(tokens)+" "
wordcloud = WordCloud(width = 1000, height = 400,
background_color ='white',
stopwords = stopwords,
min_font_size = 10).generate(abstract_words)
# plot the WordCloud image
plt.figure(figsize = (9, 9), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()
sorted_tc_dict = sorted(dict(Counter(abstract_list_filtered)).items(), key=lambda kv: kv[1])
abstract_array = np.asarray(sorted_tc_dict)
labels = np.flipud(abstract_array[:,0])
values = np.flipud(abstract_array[:,1]).astype(int)
data = [go.Bar(
x = labels[:100],
y = values[:100]
)]
fig = go.Figure(data=data)
fig.update_layout(xaxis_tickangle=-45)
fig.update_layout(
title_text='Top 100 high-frequency words for abstracts',
xaxis_title="Words",
yaxis_title="Frequency",
)
fig.show()
df_autors = soa_df.loc[:,'a_1':'a_14']
df_yc = soa_df[['año','citas', 'a_1']]
df_authors_full = reduce(lambda left,right: pd.merge(left,right,on=['a_1'],
how='outer'), [df_yc, df_autors])
df_authors_full = df_authors_full.rename(columns={'año': 'year', 'citas': 'cites'})
df_authors_full['tot_authors'] = None
all_authors_list = []
for row in tqdm(range(len(df_authors_full))):
author_row = df_authors_full.loc[row, 'a_1':'a_14'].values.tolist()
#============================================
df_authors_full['tot_authors'].iloc[row] = 14-author_row.count('-')
for el in author_row:
if el!='-':
all_authors_list.append(el)
else:
pass
df_authors_full
0%| | 0/130 [00:00<?, ?it/s]C:\Users\user\anaconda3\envs\py38dane\lib\site-packages\pandas\core\indexing.py:1637: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 130/130 [00:00<00:00, 1481.23it/s]
| year | cites | a_1 | a_2 | a_3 | a_4 | a_5 | a_6 | a_7 | a_8 | a_9 | a_10 | a_11 | a_12 | a_13 | a_14 | tot_authors | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014 | 89 | Sacha, D | Stein, M. | Schreck, T | Keim, D. | Deussen, O. | - | - | - | - | - | - | - | - | - | 5 |
| 1 | 2015 | 35 | Stein, M. | Häußler, J. | Jäckle, D. | Janetzko, H. | Schreck, T. | Keim, D. | - | - | - | - | - | - | - | - | 6 |
| 2 | 2013 | 35 | Kumar, G. | - | - | - | - | - | - | - | - | - | - | - | - | - | 1 |
| 3 | 2014 | 64 | Stensland, H. K. | Gaddam, V. R | Tennøe, M. | Helgedagsrud, E. | Næss, M. | Alstad, H. K. | Mortensen, A. | Langseth, R. | Ljødal, S. | Landsverk, Ø. | Griwodz, C. | Halvorsen, P. | Stenhaug, M. | Johansen, D. | 14 |
| 4 | 2014 | 0 | Bond, S. | - | - | - | - | - | - | - | - | - | - | - | - | - | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 125 | 2015 | 2 | Brymer, R. | Holcomb, T. R. | Rodenberg, R. M. | - | - | - | - | - | - | - | - | - | - | - | 3 |
| 126 | 2013 | 33 | Drachen, A. | Schubert, M. | - | - | - | - | - | - | - | - | - | - | - | - | 2 |
| 127 | 2019 | 3 | Yi?it, A. T. | Samak, B. | Kaya, T. | - | - | - | - | - | - | - | - | - | - | - | 3 |
| 128 | 2019 | 13 | Singh, P. | Lamba, P. S. | - | - | - | - | - | - | - | - | - | - | - | - | 2 |
| 129 | 2019 | 80 | Baboota, R. | Kaur, H. | - | - | - | - | - | - | - | - | - | - | - | - | 2 |
130 rows × 17 columns
df_authors_full_yt = df_authors_full[['year', 'tot_authors']]
df_authors_full_yt = df_authors_full_yt.groupby(['year', 'tot_authors']).count()
df_authors_full_yt = df_authors_full_yt.reset_index()
df_authors_full_yt
| year | tot_authors | |
|---|---|---|
| 0 | 2013 | 1 |
| 1 | 2013 | 2 |
| 2 | 2013 | 3 |
| 3 | 2013 | 4 |
| 4 | 2013 | 5 |
| 5 | 2014 | 1 |
| 6 | 2014 | 2 |
| 7 | 2014 | 3 |
| 8 | 2014 | 4 |
| 9 | 2014 | 5 |
| 10 | 2014 | 7 |
| 11 | 2014 | 10 |
| 12 | 2014 | 14 |
| 13 | 2015 | 1 |
| 14 | 2015 | 2 |
| 15 | 2015 | 3 |
| 16 | 2015 | 4 |
| 17 | 2015 | 5 |
| 18 | 2015 | 6 |
| 19 | 2015 | 7 |
| 20 | 2016 | 2 |
| 21 | 2016 | 3 |
| 22 | 2016 | 4 |
| 23 | 2016 | 6 |
| 24 | 2017 | 1 |
| 25 | 2017 | 2 |
| 26 | 2017 | 3 |
| 27 | 2017 | 4 |
| 28 | 2017 | 6 |
| 29 | 2017 | 8 |
| 30 | 2018 | 2 |
| 31 | 2018 | 3 |
| 32 | 2018 | 4 |
| 33 | 2018 | 5 |
| 34 | 2019 | 2 |
| 35 | 2019 | 3 |
| 36 | 2019 | 5 |
| 37 | 2020 | 4 |
| 38 | 2020 | 7 |
| 39 | 2020 | 9 |
| 40 | 2021 | 3 |
df_auth_sum = df_authors_full_yt.groupby(['year']).sum().reset_index().rename(columns={'tot_authors': 'tot'})
df_auth_mean = df_authors_full_yt.groupby(['year']).mean().reset_index().rename(columns={'tot_authors': 'mean_val'})
df_auth_median = df_authors_full_yt.groupby(['year']).median().reset_index().rename(columns={'tot_authors': 'median_val'})
df_auth_max = df_authors_full_yt.groupby(['year']).max().reset_index().rename(columns={'tot_authors': 'max_val'})
df_auth_min = df_authors_full_yt.groupby(['year']).min().reset_index().rename(columns={'tot_authors': 'min_val'})
auth_data_frames = [df_auth_sum, df_auth_mean, df_auth_median, df_auth_max, df_auth_min]
df_auth_merged = reduce(lambda left,right: pd.merge(left,right,on=['year'],
how='outer'), auth_data_frames)
df_auth_merged
| year | tot | mean_val | median_val | max_val | min_val | |
|---|---|---|---|---|---|---|
| 0 | 2013 | 15 | 3.000000 | 3.0 | 5 | 1 |
| 1 | 2014 | 46 | 5.750000 | 4.5 | 14 | 1 |
| 2 | 2015 | 28 | 4.000000 | 4.0 | 7 | 1 |
| 3 | 2016 | 15 | 3.750000 | 3.5 | 6 | 2 |
| 4 | 2017 | 24 | 4.000000 | 3.5 | 8 | 1 |
| 5 | 2018 | 14 | 3.500000 | 3.5 | 5 | 2 |
| 6 | 2019 | 10 | 3.333333 | 3.0 | 5 | 2 |
| 7 | 2020 | 20 | 6.666667 | 7.0 | 9 | 4 |
| 8 | 2021 | 3 | 3.000000 | 3.0 | 3 | 3 |
fig = go.Figure()
year_labels = ['2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']
fig.add_trace(go.Bar(x=year_labels,y=list(df_auth_merged.tot), width=0.3, name='Total'))
fig.add_trace(go.Scatter(x=year_labels,y=list(df_auth_merged.mean_val), name='Mean'))
fig.add_trace(go.Scatter(x=year_labels,y=list(df_auth_merged.median_val), name='Median'))
fig.add_trace(go.Scatter(x=year_labels,y=list(df_auth_merged.max_val), name='Max'))
fig.add_trace(go.Scatter(x=year_labels,y=list(df_auth_merged.min_val), name='Min'))
fig.update_layout(
title_text="2013 - 2021: Authors stats",
xaxis_title="Year",
yaxis_title="Total",
legend_title="Stats",
autosize=False,
width=950,
height=400,
margin=dict(l=0, r=0, t=50, b=0),
font=dict(size=11))
fig.show()
letter_counts = Counter(all_authors_list)
df_authors = pd.DataFrame.from_dict(letter_counts, orient='index').rename(columns={'index': 'author', 0: 'count'})
df_authors = df_authors.sort_values(by=['count'], ascending=True).reset_index().rename(columns={'index': 'author', 0: 'count'})
fig = go.Figure(go.Bar(
x=list(df_authors['count']),
y=list(df_authors['author']),
orientation='h'))
fig.update_layout(
title_text="2013 - 2021: Authors stats - Tot. prod.",
xaxis_title="Total",
yaxis_title="Author name")
fig.show()